#!/usr/bin/env bash
# Environment setup for the torchrun launch further down in this script.
# Every variable below is exported so that child processes inherit it.

# Echo each command to stderr before it runs (debugging aid).
set -x

# NOTE(review): presumably the directory Triton uses for its kernel cache;
# /tmp/triton is a fixed, shared path — confirm that is acceptable here.
export TRITON_CACHE_DIR="/tmp/triton"

# NCCL transport settings. The NCCL_IB_* values (traffic class, service
# level, GID index, HCA name prefix, timeout, queue pairs per connection)
# look cluster-specific — TODO confirm against the NCCL environment-variable
# documentation and the target fabric before reusing on another cluster.
export NCCL_IB_TC=136
export NCCL_IB_SL=5
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth
export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5
export NCCL_IB_TIMEOUT=22
export NCCL_IB_QPS_PER_CONNECTION=8
export NCCL_MIN_NCHANNELS=4
export NCCL_NET_PLUGIN=none

# ACCL_* settings. NOTE(review): presumably read by a vendor communication
# library layered on NCCL; their exact meaning is not visible from this
# script — verify against the platform documentation.
export ACCL_C4_STATS_MODE=CONN
export ACCL_IB_SPLIT_DATA_NUM=4
export ACCL_IB_QPS_LOAD_BALANCE=1
export ACCL_IB_GID_INDEX_FIX=1
export ACCL_LOG_TIME=1


# Run directory: handed to the trainer as its work dir and used as the
# location of training_log.txt and of a snapshot of this launcher script.
OUTPUT_DIR='work_dirs/pretrain_aliyun_dsw_test'

# Extra trainer arguments supplied by the caller; empty when none are given.
: "${PY_ARGS:=}"

# Create the run directory on first use, then keep a copy of this script
# next to the run's outputs so the exact launch settings are preserved.
[ -d "$OUTPUT_DIR" ] || mkdir -p "$OUTPUT_DIR"
SCRIPT_NAME=$(basename "$0")
cp "$0" "${OUTPUT_DIR}/${SCRIPT_NAME}"

# Number of processes torchrun starts on this node; the caller may override
# it through the environment, otherwise 2 is used.
: "${GPUS_PER_NODE:=2}"

# Python module search path: the Liger-Kernel sources, the current
# directory, and its parent. Any pre-existing PYTHONPATH is replaced.
export PYTHONPATH="/cpfs01/shared/public/caoweihan/projects/Liger-Kernel/src/:$(pwd):$(pwd)/../"

# Launch the training entry point under torchrun with GPUS_PER_NODE
# processes, forcing the Hugging Face datasets/transformers libraries into
# offline mode for this command only. Combined stdout+stderr is shown on
# the terminal and appended to ${OUTPUT_DIR}/training_log.txt.
#
# Without pipefail the pipeline (and therefore this script) would exit with
# tee's status, so a crashed training run would look successful to the
# caller or job scheduler. The subshell probe keeps the script working in
# shells that do not support the option.
(set -o pipefail) 2>/dev/null && set -o pipefail

# PY_ARGS is intentionally left unquoted so that it splits into separate
# command-line arguments.
# shellcheck disable=SC2086
HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 torchrun \
  --nproc-per-node="${GPUS_PER_NODE}" \
  fsdp2_pretrain_moe_interntrain.py \
  --train-cfg xx.py \
  --llm '/cpfs01/shared/public/caoweihan/projects/xtuner_internlm3_moe/xpuyu/internlm3_moe_lite_3l' \
  --work-dir "${OUTPUT_DIR}" \
  --log-interval 2 \
  --seed 42 \
  --checkpoint-interval 100 \
  --max-keep-ckpts 2 \
  --resume \
  ${PY_ARGS} \
  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
